{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get metadata from Dataverse for each DOI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests as req\n",
    "import json\n",
    "import math\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "from string import Template\n",
    "query_base=Template('https://dataverse.harvard.edu/api/search?q=\"${doi}\"&show_facets=true')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas\n",
    "unique_dois = pandas.read_csv(\"../get-dois/dataset_dois.txt\", names=['DOI'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DOI</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>doi:10.7910/DVN/U3QJQZ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>doi:10.7910/DVN/Z7H44N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>doi:10.7910/DVN/HRLHA4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>doi:10.7910/DVN/RJWU7A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>doi:10.7910/DVN/FAAMAX</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      DOI\n",
       "0  doi:10.7910/DVN/U3QJQZ\n",
       "1  doi:10.7910/DVN/Z7H44N\n",
       "2  doi:10.7910/DVN/HRLHA4\n",
       "3  doi:10.7910/DVN/RJWU7A\n",
       "4  doi:10.7910/DVN/FAAMAX"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unique_dois.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_dois = unique_dois['DOI'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2170"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of DOIs in the list.\n",
    "len(unique_dois)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = query_base.substitute(doi = unique_dois[0])\n",
    "res = req.get(query)\n",
    "res_dict = json.loads(res.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2020'"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time = str(res_dict['data']['items'][0]['published_at'][:4])\n",
    "time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_metadata=\"\"\n",
    "all_metadata = ','.join([\"DOI\",\"publicationDate\",\"publisher\",\"subject\"]) + os.linesep\n",
    "for i,doi in enumerate(unique_dois):\n",
    "        query = query_base.substitute(doi = doi)\n",
    "\n",
    "        res = req.get(query)\n",
    "        res_dict = json.loads(res.text)\n",
    "        try:\n",
    "            time = res_dict['data']['items'][0]['published_at'][:4]\n",
    "        except IndexError:\n",
    "            print res_dict['data']['items']\n",
    "            \n",
    "        try:\n",
    "            subs = res_dict['data']['facets'][0]['subject_ss']['labels']\n",
    "            subject = \";\".join(str(s.keys()[0]) for s in subs)\n",
    "        except KeyError:\n",
    "            subs = \"NA\"\n",
    "        \n",
    "        try:\n",
    "            publisher = res_dict['data']['items'][0]['identifier_of_dataverse']\n",
    "        except:\n",
    "            print doi\n",
    "            print res_dict['data']\n",
    "            \n",
    "        metadata_line = ','.join([doi.strip(), time, publisher.replace(\",\",\";\"), subject.replace(\",\",\";\")])\n",
    "        #print metadata_line\n",
    "        all_metadata += metadata_line + os.linesep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>DOI</th>\n",
       "      <th>publicationDate</th>\n",
       "      <th>publisher</th>\n",
       "      <th>subject</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>doi:10.7910/DVN/U3QJQZ</td>\n",
       "      <td>2020</td>\n",
       "      <td>harvard</td>\n",
       "      <td>Medicine; Health and Life Sciences</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>doi:10.7910/DVN/Z7H44N</td>\n",
       "      <td>2017</td>\n",
       "      <td>PSRM</td>\n",
       "      <td>Social Sciences</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>doi:10.7910/DVN/HRLHA4</td>\n",
       "      <td>2016</td>\n",
       "      <td>pan</td>\n",
       "      <td>Mathematical Sciences;Social Sciences</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>doi:10.7910/DVN/RJWU7A</td>\n",
       "      <td>2015</td>\n",
       "      <td>intertransferuschina</td>\n",
       "      <td>Social Sciences</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>doi:10.7910/DVN/FAAMAX</td>\n",
       "      <td>2018</td>\n",
       "      <td>BLS-PNAS</td>\n",
       "      <td>Social Sciences</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      DOI  publicationDate             publisher  \\\n",
       "0  doi:10.7910/DVN/U3QJQZ             2020               harvard   \n",
       "1  doi:10.7910/DVN/Z7H44N             2017                  PSRM   \n",
       "2  doi:10.7910/DVN/HRLHA4             2016                   pan   \n",
       "3  doi:10.7910/DVN/RJWU7A             2015  intertransferuschina   \n",
       "4  doi:10.7910/DVN/FAAMAX             2018              BLS-PNAS   \n",
       "\n",
       "                                 subject  \n",
       "0     Medicine; Health and Life Sciences  \n",
       "1                        Social Sciences  \n",
       "2  Mathematical Sciences;Social Sciences  \n",
       "3                        Social Sciences  \n",
       "4                        Social Sciences  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd \n",
    "from StringIO import StringIO\n",
    "\n",
    "df = pd.read_csv(StringIO(all_metadata))\n",
    "#df = pd.read_csv(StringIO(all_metadata), delimiter=',')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "285"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('data/all_metadata.txt','w') as f:\n",
    "    f.write(all_metadata.encode('utf8'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get publisher metadata from Dataverse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>publicationDate</th>\n      <th>publisher</th>\n      <th>subject</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/U3QJQZ</td>\n      <td>2020</td>\n      <td>harvard</td>\n      <td>Medicine; Health and Life Sciences</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/HRLHA4</td>\n      <td>2016</td>\n      <td>pan</td>\n      <td>Mathematical Sciences;Social Sciences</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/RJWU7A</td>\n      <td>2015</td>\n      <td>intertransferuschina</td>\n      <td>Social Sciences</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/FAAMAX</td>\n      <td>2018</td>\n      <td>BLS-PNAS</td>\n      <td>Social Sciences</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/PAHRCK</td>\n      <td>2019</td>\n      <td>internationalinteractions</td>\n      <td>Social Sciences</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                      doi  publicationDate                  publisher  \\\n0  doi:10.7910/DVN/U3QJQZ             2020                    harvard   \n1  doi:10.7910/DVN/HRLHA4             2016                        pan   \n2  doi:10.7910/DVN/RJWU7A             2015       intertransferuschina   \n3  doi:10.7910/DVN/FAAMAX             2018                   BLS-PNAS   \n4  doi:10.7910/DVN/PAHRCK             2019  internationalinteractions   \n\n                                 subject  \n0     Medicine; Health and Life Sciences  \n1  Mathematical Sciences;Social Sciences  \n2                        Social Sciences  \n3                        Social Sciences  \n4                        Social Sciences  "
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "metadata = pd.read_csv(\"data/all_metadata.txt\")\n",
    "metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "array(['harvard', 'pan', 'intertransferuschina', 'BLS-PNAS',\n       'internationalinteractions', 'isq', 'monogan', 'govdept', 'restat',\n       'palcomms'], dtype=object)"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unique = metadata['publisher'].unique()\n",
    "unique[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "{u'status': u'ERROR', u'message': u\"Can't find dataverse with identifier='palcomms'\"}\n{u'status': u'ERROR', u'message': u\"Can't find dataverse with identifier='SPSRleemann2015'\"}\n"
    }
   ],
   "source": [
    "from string import Template\n",
    "import requests as req\n",
    "import json\n",
    "import os\n",
    "\n",
    "qbase=Template('https://dataverse.harvard.edu/api/dataverses/${iddv}')\n",
    "\n",
    "all_journals = \"\"\n",
    "for i,iddv in enumerate(unique):\n",
    "    query = qbase.substitute(iddv = iddv)\n",
    "    res = req.get(query)\n",
    "    res_dict = json.loads(res.text)\n",
    "    \n",
    "    try:\n",
    "        name = res_dict['data']['name']\n",
    "        dvtype = res_dict['data']['dataverseType']\n",
    "        \n",
    "        metadata_line = '\\t'.join([iddv.strip(), dvtype, name])\n",
    "        #print metadata_line\n",
    "        all_journals += metadata_line + os.linesep\n",
    "    except:\n",
    "        print res_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>publisher</th>\n      <th>type</th>\n      <th>name</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>harvard</td>\n      <td>ORGANIZATIONS_INSTITUTIONS</td>\n      <td>Harvard Dataverse</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>pan</td>\n      <td>JOURNALS</td>\n      <td>Political Analysis Dataverse</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>intertransferuschina</td>\n      <td>RESEARCH_PROJECTS</td>\n      <td>Behind Parent-Child Relationship and Intergene...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>BLS-PNAS</td>\n      <td>RESEARCH_PROJECTS</td>\n      <td>Bechtel Liesch Scheve PNAS</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>internationalinteractions</td>\n      <td>JOURNALS</td>\n      <td>International Interactions (II): Empirical and...</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                   publisher                        type  \\\n0                    harvard  ORGANIZATIONS_INSTITUTIONS   \n1                        pan                    JOURNALS   \n2       intertransferuschina           RESEARCH_PROJECTS   \n3                   BLS-PNAS           RESEARCH_PROJECTS   \n4  internationalinteractions                    JOURNALS   \n\n                                                name  \n0                                  Harvard Dataverse  \n1                       Political Analysis Dataverse  \n2  Behind Parent-Child Relationship and Intergene...  \n3                         Bechtel Liesch Scheve PNAS  \n4  International Interactions (II): Empirical and...  "
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from StringIO import StringIO\n",
    "dj = pd.read_csv(StringIO(all_journals), sep='\\t', names=['publisher', 'type', 'name'])\n",
    "\n",
    "dj.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "2002"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "288"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dj)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged = pd.merge(metadata, dj, on='publisher')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "1998"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(merged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>doi</th>\n      <th>publicationDate</th>\n      <th>publisher</th>\n      <th>subject</th>\n      <th>type</th>\n      <th>name</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>doi:10.7910/DVN/U3QJQZ</td>\n      <td>2020</td>\n      <td>harvard</td>\n      <td>Medicine; Health and Life Sciences</td>\n      <td>ORGANIZATIONS_INSTITUTIONS</td>\n      <td>Harvard Dataverse</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>doi:10.7910/DVN/6KEXM7</td>\n      <td>2020</td>\n      <td>harvard</td>\n      <td>Social Sciences</td>\n      <td>ORGANIZATIONS_INSTITUTIONS</td>\n      <td>Harvard Dataverse</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>doi:10.7910/DVN/EEUTHP</td>\n      <td>2019</td>\n      <td>harvard</td>\n      <td>Social Sciences</td>\n      <td>ORGANIZATIONS_INSTITUTIONS</td>\n      <td>Harvard Dataverse</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>doi:10.7910/DVN/WBI9RT</td>\n      <td>2015</td>\n      <td>harvard</td>\n      <td>Social Sciences</td>\n      <td>ORGANIZATIONS_INSTITUTIONS</td>\n      <td>Harvard Dataverse</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>doi:10.7910/DVN/N0PBQ9</td>\n      <td>2015</td>\n      <td>harvard</td>\n      <td>Social Sciences</td>\n      <td>ORGANIZATIONS_INSTITUTIONS</td>\n      <td>Harvard Dataverse</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "                      doi  publicationDate publisher  \\\n0  doi:10.7910/DVN/U3QJQZ             2020   harvard   \n1  doi:10.7910/DVN/6KEXM7             2020   harvard   \n2  doi:10.7910/DVN/EEUTHP             2019   harvard   \n3  doi:10.7910/DVN/WBI9RT             2015   harvard   \n4  doi:10.7910/DVN/N0PBQ9             2015   harvard   \n\n                              subject                        type  \\\n0  Medicine; Health and Life Sciences  ORGANIZATIONS_INSTITUTIONS   \n1                     Social Sciences  ORGANIZATIONS_INSTITUTIONS   \n2                     Social Sciences  ORGANIZATIONS_INSTITUTIONS   \n3                     Social Sciences  ORGANIZATIONS_INSTITUTIONS   \n4                     Social Sciences  ORGANIZATIONS_INSTITUTIONS   \n\n                name  \n0  Harvard Dataverse  \n1  Harvard Dataverse  \n2  Harvard Dataverse  \n3  Harvard Dataverse  \n4  Harvard Dataverse  "
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merged.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged.to_csv('data/metadata_merged.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "3.7.3-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}